home *** CD-ROM | disk | FTP | other *** search
Wrap
# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)
'''Unit tests for Beautiful Soup.

These tests make sure the Beautiful Soup works as it should. If you
find a bug in Beautiful Soup, the best way to express it is as a test
case like this that fails.'''

# NOTE(review): this module was recovered from a whitespace-collapsed,
# HTML-rendered dump of decompiler output.  That rendering decoded entity
# references inside string literals, collapsed runs of spaces, and showed
# UTF-8 bytes as CP437 glyphs.  Literals tagged "restored" below were
# rebuilt from what the surrounding assertions require; confirm them
# against the upstream Beautiful Soup test suite before relying on them.
# Non-ASCII characters are written as escapes so the file stays pure ASCII
# (Python 2 rejects non-ASCII source bytes without a coding declaration).

import re
import unittest

from BeautifulSoup import *


class SoupTest(unittest.TestCase):
    '''Base class providing the parse-and-compare helper used by all tests.'''

    def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup,
                         encoding=None):
        '''Parse the given text and make sure its string rep is the other
        given text.

        toParse  -- markup handed to the parser class ``c``.
        rep      -- expected serialisation; defaults to ``toParse`` itself.
        c        -- parser class to instantiate (BeautifulSoup by default).
        encoding -- when given, compare against ``obj.encode(encoding)``
                    instead of ``obj.decode()``.
        '''
        if rep is None:
            rep = toParse
        obj = c(toParse)
        if encoding is None:
            rep2 = obj.decode()
        else:
            rep2 = obj.encode(encoding)
        self.assertEqual(rep2, rep)


class FollowThatTag(SoupTest):
    '''Tests the various ways of fetching tags from a soup.'''

    def setUp(self):
        ml = ('\n <a id="x">1</a>'
              '\n <A id="a">2</a>'
              '\n <b id="b">3</a>'
              '\n <b href="foo" id="x">4</a>'
              '\n <ac width=100>4</ac>')
        self.soup = BeautifulStoneSoup(ml)

    def testFindAllByName(self):
        matching = self.soup('a')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching, self.soup.findAll('a'))
        self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))

    def testFindAllByAttribute(self):
        matching = self.soup.findAll(id='x')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching[1].name, 'b')

        matching2 = self.soup.findAll(attrs={'id': 'x'})
        self.assertEqual(matching, matching2)

        strainer = SoupStrainer(attrs={'id': 'x'})
        self.assertEqual(matching, self.soup.findAll(strainer))

        self.assertEqual(len(self.soup.findAll(id=None)), 1)
        self.assertEqual(len(self.soup.findAll(width=100)), 1)
        self.assertEqual(len(self.soup.findAll(junk=None)), 5)
        self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5)
        self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0)
        self.assertEqual(len(self.soup.findAll(junk=True)), 0)
        self.assertEqual(len(self.soup.findAll(href=True)), 1)

    def testFindallByClass(self):
        soup = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>')
        self.assertEqual(soup.find('a', '1').string, 'Bar')

    def testFindAllByList(self):
        matching = self.soup(['a', 'ac'])
        self.assertEqual(len(matching), 3)

    def testFindAllByHash(self):
        matching = self.soup({'a': True, 'b': True})
        self.assertEqual(len(matching), 4)

    def testFindAllText(self):
        soup = BeautifulSoup('<html>\xbb</html>')
        # The dump showed the expected value as the CP437 rendering of the
        # UTF-8 bytes C2 BB, i.e. U+00BB.
        self.assertEqual(soup.findAll(text=re.compile('.*')), [u'\xbb'])

    def testFindAllByRE(self):
        r = re.compile('a.*')
        self.assertEqual(len(self.soup(r)), 3)

    def testFindAllByMethod(self):
        def matchTagWhereIDMatchesName(tag):
            return tag.name == tag.get('id')

        matching = self.soup.findAll(matchTagWhereIDMatchesName)
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')

    def testParents(self):
        soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul>'
                             '<ul id="foo" a="b"><b>Blah')
        b = soup.b
        self.assertEqual(len(b.findParents('ul', {'id': 'foo'})), 2)
        self.assertEqual(b.findParent('ul')['a'], 'b')

    # Shared fixture, parsed once at class-definition time.
    PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')

    def testNext(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id': 2})
        self.assertEqual(b.findNext('b')['id'], '3')
        self.assertEqual(len(b.findAllNext('b')), 2)
        self.assertEqual(len(b.findAllNext('b', {'id': 4})), 1)

    def testPrevious(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id': 3})
        self.assertEqual(b.findPrevious('b')['id'], '2')
        self.assertEqual(len(b.findAllPrevious('b')), 2)
        self.assertEqual(len(b.findAllPrevious('b', {'id': 2})), 1)

    # Shared fixture, parsed once at class-definition time.
    SIBLING_TEST = BeautifulSoup(
        '<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote>'
        '<blockquote id="2"><blockquote id="2.1"></blockquote></blockquote>'
        '<blockquote id="3"><blockquote id="3.1"></blockquote></blockquote>'
        '<blockquote id="4">')

    def testNextSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id': 2})
        self.assertEqual(b.findNext(tag)['id'], '2.1')
        self.assertEqual(b.findNextSibling(tag)['id'], '3')
        self.assertEqual(len(b.findNextSiblings(tag)), 2)
        self.assertEqual(len(b.findNextSiblings(tag, {'id': 4})), 1)

    def testPreviousSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id': 3})
        self.assertEqual(b.findPrevious(tag)['id'], '2.1')
        self.assertEqual(b.findPreviousSibling(tag)['id'], '2')
        self.assertEqual(len(b.findPreviousSiblings(tag)), 2)
        self.assertEqual(len(b.findPreviousSiblings(tag, id=1)), 1)

    def testTextNavigation(self):
        soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee'
                             '<hr id="1"/></b></i>Blargh')
        baz = soup.find(text='Baz')
        self.assertEqual(baz.findParent('i')['id'], '1')
        self.assertEqual(baz.findNext(text='Blee'), 'Blee')
        self.assertEqual(baz.findNextSibling(text='Blee'), 'Blee')
        self.assertEqual(baz.findNextSibling(text='Blargh'), None)
        self.assertEqual(baz.findNextSibling('hr')['id'], '1')


class SiblingRivalry(SoupTest):
    '''Tests the nextSibling and previousSibling navigation.'''

    def testSiblings(self):
        soup = BeautifulSoup('<ul><li>1<p>A</p>B<li>2<li>3</ul>')
        secondLI = soup.find('li').nextSibling
        # The decompiler split this into ``if ...: pass`` plus a lone
        # assertion, which silently dropped the tag-name check.
        self.assertTrue(secondLI.name == 'li' and secondLI.string == '2')
        self.assertEqual(soup.find(text='1').nextSibling.name, 'p')
        self.assertEqual(soup.find('p').nextSibling, 'B')
        self.assertEqual(
            soup.find('p').nextSibling.previousSibling.nextSibling, 'B')


class TagsAreObjectsToo(SoupTest):
    '''Tests the various built-in functions of Tag objects.'''

    def testLen(self):
        soup = BeautifulSoup('<top>1<b>2</b>3</top>')
        self.assertEqual(len(soup.top), 3)


class StringEmUp(SoupTest):
    """Tests the use of 'string' as an alias for a tag's only content."""

    def testString(self):
        s = BeautifulSoup('<b>foo</b>')
        self.assertEqual(s.b.string, 'foo')

    def testLackOfString(self):
        s = BeautifulSoup('<b>f<i>e</i>o</b>')
        self.assertFalse(s.b.string)


class ThatsMyLimit(SoupTest):
    '''Tests the limit argument.'''

    def testBasicLimits(self):
        s = BeautifulSoup('<br id="1" /><br id="1" />'
                          '<br id="1" /><br id="1" />')
        self.assertEqual(len(s.findAll('br')), 4)
        self.assertEqual(len(s.findAll('br', limit=2)), 2)
        self.assertEqual(len(s('br', limit=2)), 2)


class OnlyTheLonely(SoupTest):
    '''Tests the parseOnly argument to the constructor.'''

    def setUp(self):
        # Five <a id="1".."5"> tags, each holding three <b> tags whose ids
        # and text are "<i>.100" .. "<i>.102".
        x = []
        for i in range(1, 6):
            x.append('<a id="%s">' % i)
            for j in range(100, 103):
                x.append('<b id="%s.%s">Content %s.%s</b>' % (i, j, i, j))
            x.append('</a>')
        self.x = ''.join(x)

    def testOnly(self):
        strainer = SoupStrainer('b')
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 15)

        strainer = SoupStrainer(id=re.compile('100.*'))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 5)

        strainer = SoupStrainer(text=re.compile('10[01].*'))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 10)

        # Index 8 of "Content <i>.<j>" is the digit <i>.
        strainer = SoupStrainer(text=lambda x: x[8] == '3')
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 3)


class PickleMeThis(SoupTest):
    '''Testing features like pickle and deepcopy.'''

    def setUp(self):
        self.page = (
            '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"\n'
            '"http://www.w3.org/TR/REC-html40/transitional.dtd">\n'
            '<html>\n'
            '<head>\n'
            '<meta http-equiv="Content-Type" '
            'content="text/html; charset=utf-8">\n'
            '<title>Beautiful Soup: We called him Tortoise because he '
            'taught us.</title>\n'
            '<link rev="made" href="mailto:leonardr@segfault.org">\n'
            '<meta name="Description" content="Beautiful Soup: an HTML '
            'parser optimized for screen-scraping.">\n'
            '<meta name="generator" content="Markov Approximation 1.4 '
            '(module: leonardr)">\n'
            '<meta name="author" content="Leonard Richardson">\n'
            '</head>\n'
            '<body>\n'
            '<a href="foo">foo</a>\n'
            '<a href="foo"><b>bar</b></a>\n'
            '</body>\n'
            '</html>')
        self.soup = BeautifulSoup(self.page)

    def testPickle(self):
        import pickle
        dumped = pickle.dumps(self.soup, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), self.soup.decode())

    def testDeepcopy(self):
        # The dump had the garbled ``deepcopy = deepcopy import copy`` here.
        from copy import deepcopy
        deepcopy(BeautifulSoup('<a></a>'))
        copied = deepcopy(self.soup)
        self.assertEqual(copied.decode(), self.soup.decode())

    def testUnicodePickle(self):
        import cPickle as pickle
        html = '<b>' + chr(195) + '</b>'
        soup = BeautifulSoup(html)
        dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.decode(), soup.decode())


class WriteOnlyCode(SoupTest):
    '''Testing the modification of the tree.'''

    def testModifyAttributes(self):
        soup = BeautifulSoup('<a id="1"></a>')
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), '<a id="2"></a>')
        del soup.a['id']
        self.assertEqual(soup.decode(), '<a></a>')
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), '<a id2="foo"></a>')

    def testNewTagCreation(self):
        """Makes sure tags don't step on each others' toes."""
        soup = BeautifulSoup()
        a = Tag(soup, 'a')
        ol = Tag(soup, 'ol')
        a['href'] = 'http://foo.com/'
        self.assertRaises(KeyError, lambda: ol['href'])

    def testTagReplacement(self):
        # Replacing a tag with itself must be a no-op.
        text = '<a><b></b><c>Foo<d></d></c></a><a><e></e></a>'
        soup = BeautifulSoup(text)
        c = soup.c
        soup.c.replaceWith(c)
        self.assertEqual(soup.decode(), text)

        # Replacing a text node rewires next/previous/parent.
        soup = BeautifulSoup('<b>Argh!</b>')
        soup.find(text='Argh!').replaceWith('Hooray!')
        newText = soup.find(text='Hooray!')
        b = soup.b
        self.assertEqual(newText.previous, b)
        self.assertEqual(newText.parent, b)
        self.assertEqual(newText.previous.next, newText)
        self.assertEqual(newText.next, None)

        # Inserting a text node after an existing one.
        soup = BeautifulSoup('<a><b>Argh!</b><c></c><d></d></a>')
        soup.b.insert(1, 'Hooray!')
        newText = soup.find(text='Hooray!')
        self.assertEqual(newText.previous, 'Argh!')
        self.assertEqual(newText.previous.next, newText)
        self.assertEqual(newText.previousSibling, 'Argh!')
        self.assertEqual(newText.previousSibling.nextSibling, newText)
        self.assertEqual(newText.nextSibling, None)
        self.assertEqual(newText.next, soup.c)

        # Moving an existing tag over another one.
        text = ("<html>There's <b>no</b> business like "
                "<b>show</b> business</html>")
        soup = BeautifulSoup(text)
        (no, show) = soup.findAll('b')
        show.replaceWith(no)
        # Taking <b>no</b> out of its old position leaves the two spaces
        # that surrounded it side by side; the dump had collapsed them.
        self.assertEqual(
            soup.decode(),
            "<html>There's  business like <b>no</b> business</html>")

        # Inserting a brand-new tag between two siblings.
        soup = BeautifulSoup('<a><b>Find</b><c>lady!</c><d></d></a>')
        tag = Tag(soup, 'magictag')
        tag.insert(0, 'the')
        soup.a.insert(1, tag)

        b = soup.b
        c = soup.c
        theText = tag.find(text=True)
        findText = b.find(text='Find')

        self.assertEqual(findText.next, tag)
        self.assertEqual(tag.previous, findText)
        self.assertEqual(b.nextSibling, tag)
        self.assertEqual(tag.previousSibling, b)
        self.assertEqual(tag.nextSibling, c)
        self.assertEqual(c.previousSibling, tag)
        self.assertEqual(theText.next, c)
        self.assertEqual(c.previous, theText)

        # Replacing a tag with one taken from elsewhere in the tree.
        soup = BeautifulSoup('<a>We<b>reserve<c>the</c><d>right</d></b></a>'
                             '<e>to<f>refuse</f><g>service</g></e>')
        f = soup.f
        a = soup.a
        weText = a.find(text='We')
        soup.b.replaceWith(soup.f)
        self.assertEqual(soup.decode(),
                         '<a>We<f>refuse</f></a><e>to<g>service</g></e>')

        self.assertEqual(f.previous, weText)
        self.assertEqual(weText.next, f)
        self.assertEqual(f.previousSibling, weText)
        self.assertEqual(f.nextSibling, None)
        self.assertEqual(weText.nextSibling, f)

    def testAppend(self):
        doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
        soup = BeautifulSoup(doc)
        second_para = soup('p')[1]
        bold = soup.find('b')
        soup('p')[1].append(soup.find('b'))
        self.assertEqual(bold.parent, second_para)
        self.assertEqual(
            soup.decode(),
            "<p>Don't leave me .</p> <p>Don't leave me.<b>here</b></p>")

    def testTagExtraction(self):
        # A tag can be lifted out of its document.
        text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
        soup = BeautifulSoup(text)
        extracted = soup.find('div', id='nav').extract()
        self.assertEqual(soup.decode(), '<html>Real content here.</html>')
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # Extraction severs every link between the tag and the document.
        text = '<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>'
        soup = BeautifulStoneSoup(text)
        doc = soup.doc
        (numbers, roman, letters) = soup('a')

        self.assertEqual(roman.parent, doc)
        oldPrevious = roman.previous
        endOfThisTag = roman.nextSibling.previous
        self.assertEqual(oldPrevious, '2')
        self.assertEqual(roman.next, 'i')
        self.assertEqual(endOfThisTag, 'ii')
        self.assertEqual(roman.previousSibling, numbers)
        self.assertEqual(roman.nextSibling, letters)

        roman.extract()
        self.assertEqual(roman.parent, None)
        self.assertEqual(roman.previous, None)
        self.assertEqual(roman.next, 'i')
        self.assertEqual(letters.previous, '2')
        self.assertEqual(roman.previousSibling, None)
        self.assertEqual(roman.nextSibling, None)
        self.assertEqual(endOfThisTag.next, None)
        self.assertEqual(roman.b.contents[0].next, None)
        self.assertEqual(numbers.nextSibling, letters)
        self.assertEqual(letters.previousSibling, numbers)
        self.assertEqual(len(doc.contents), 2)
        self.assertEqual(doc.contents[0], numbers)
        self.assertEqual(doc.contents[1], letters)

        # Extracting a nested tag reconnects what surrounded it.
        text = '<a>1<b>2<c>Hollywood, baby!</c></b></a>3'
        soup = BeautifulStoneSoup(text)
        one = soup.find(text='1')
        three = soup.find(text='3')
        soup.b.extract()
        self.assertEqual(one.next, three)
        self.assertEqual(three.previous, one)
        self.assertEqual(one.parent.nextSibling, three)
        self.assertEqual(three.previousSibling, soup.a)


class TheManWithoutAttributes(SoupTest):
    '''Test attribute access'''

    def testHasKey(self):
        text = "<foo attr='bar'>"
        self.assertTrue(BeautifulSoup(text).foo.has_key('attr'))


class QuoteMeOnThat(SoupTest):
    '''Test quoting'''

    def testQuotedAttributeValues(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

        text = '<foo attr=\'bar "brawls" happen\'>a</foo>'
        soup = BeautifulSoup(text)
        self.assertEqual(soup.decode(), text)

        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        newText = ('<foo attr=\'Brawls happen at "Bob&squot;s Bar"\'>'
                   'a</foo>')
        self.assertSoupEquals(soup.decode(), newText)

        # NOTE(review): "&amp;" restored in the expected value; the dump
        # showed a bare "&" on both sides, yet testNakedAmpersands below
        # expects naked ampersands to come back escaped.  Confirm.
        self.assertSoupEquals(
            '<this is="really messed up & stuff">',
            '<this is="really messed up &amp; stuff"></this>')


class YoureSoLiteral(SoupTest):
    '''Test literal mode.'''

    def testLiteralMode(self):
        text = '<script>if (i<imgs.length)</script><b>Foo</b>'
        soup = BeautifulSoup(text)
        self.assertEqual(soup.script.contents[0], 'if (i<imgs.length)')
        self.assertEqual(soup.b.contents[0], 'Foo')

    def testTextArea(self):
        text = ('<textarea><b>This is an example of an HTML tag</b>'
                '<&<&</textarea>')
        soup = BeautifulSoup(text)
        self.assertEqual(soup.textarea.contents[0],
                         '<b>This is an example of an HTML tag</b><&<&')


class OperatorOverload(SoupTest):
    '''Our operators do it all! Call now!'''

    def testTagNameAsFind(self):
        '''Tests that referencing a tag name as a member delegates to
        find().'''
        soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.b.i.string, 'bar')
        self.assertEqual(soup.b['id'], '1')
        self.assertEqual(soup.b.contents[0], 'foo')
        self.assertFalse(soup.a)

        # The "<name>Tag" spelling is exercised as an equivalent accessor.
        self.assertEqual(soup.bTag.iTag.string, 'bar')
        self.assertEqual(soup.b.iTag.string, 'bar')
        self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)


class NestableEgg(SoupTest):
    '''Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!'''

    def testParaInsideBlockquote(self):
        soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
        self.assertEqual(soup.blockquote.p.b.string, 'Foo')
        self.assertEqual(soup.blockquote.b.string, 'Foo')
        self.assertEqual(soup.find('p', recursive=False).string, 'Bar')

    def testNestedTables(self):
        text = ('<table id="1"><tr><td>Here\'s another table:\n '
                '<table id="2"><tr><td>Juicy text</td></tr></table>'
                '</td></tr></table>')
        soup = BeautifulSoup(text)
        self.assertEqual(soup.table.table.td.string, 'Juicy text')
        self.assertEqual(len(soup.findAll('table')), 2)
        self.assertEqual(len(soup.table.findAll('table')), 1)
        self.assertEqual(
            soup.find('table', {'id': 2}).parent.parent.parent.name, 'table')

        text = '<table><tr><td><div><table>Foo</table></div></td></tr></table>'
        soup = BeautifulSoup(text)
        self.assertEqual(soup.table.tr.td.div.table.contents[0], 'Foo')

        text = ('<table><thead><tr>Foo</tr></thead>'
                '<tbody><tr>Bar</tr></tbody>\n '
                '<tfoot><tr>Baz</tr></tfoot></table>')
        soup = BeautifulSoup(text)
        self.assertEqual(soup.table.thead.tr.contents[0], 'Foo')

    def testBadNestedTables(self):
        soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
        self.assertEqual(soup.table.tr.table.tr['id'], 'nested')


class CleanupOnAisleFour(SoupTest):
    '''Here we test cleanup of text that breaks HTMLParser or is just
    obnoxious.'''

    def testSelfClosingtag(self):
        self.assertEqual(BeautifulSoup('Foo<br/>Bar').find('br').decode(),
                         '<br />')

        self.assertSoupEquals('<p>test1<br/>test2</p>',
                              '<p>test1<br />test2</p>')

        text = '<p>test1<selfclosing>test2'
        soup = BeautifulStoneSoup(text)
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing>test2</selfclosing></p>')

        soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing')
        self.assertEqual(soup.decode(), '<p>test1<selfclosing />test2</p>')

    def testSelfClosingTagOrNot(self):
        text = '<item><link>http://foo.com/</link></item>'
        self.assertEqual(BeautifulStoneSoup(text).decode(), text)
        self.assertEqual(BeautifulSoup(text).decode(),
                         '<item><link />http://foo.com/</item>')

    def testBooleanAttributes(self):
        text = '<td nowrap>foo</td>'
        self.assertSoupEquals(text, text)

    def testCData(self):
        xml = '<root>foo<![CDATA[foobar]]>bar</root>'
        self.assertSoupEquals(xml, xml)
        r = re.compile('foo.*bar')
        soup = BeautifulSoup(xml)
        self.assertEqual(soup.find(text=r).string, 'foobar')
        self.assertEqual(soup.find(text=r).__class__, CData)

    def testComments(self):
        xml = 'foo<!--foobar-->baz'
        self.assertSoupEquals(xml)
        r = re.compile('foo.*bar')
        soup = BeautifulSoup(xml)
        self.assertEqual(soup.find(text=r).string, 'foobar')
        self.assertEqual(soup.find(text='foobar').__class__, Comment)

    def testDeclaration(self):
        xml = 'foo<!DOCTYPE foobar>baz'
        self.assertSoupEquals(xml)
        r = re.compile('.*foo.*bar')
        soup = BeautifulSoup(xml)
        text = 'DOCTYPE foobar'
        self.assertEqual(soup.find(text=r).string, text)
        self.assertEqual(soup.find(text=text).__class__, Declaration)

        namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
                              '<html>foo</html>')
        soup = BeautifulSoup(namespaced_doctype)
        self.assertEqual(soup.contents[0],
                         'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
        self.assertEqual(soup.html.contents[0], 'foo')

    def testEntityConversions(self):
        # NOTE(review): every entity reference in this method is restored.
        # The dump showed them decoded ("<", e-acute, the trademark sign,
        # "'"), which left an unterminated literal and non-ASCII byte
        # strings.  The HTML_ENTITIES expectations pin which positions were
        # entities; the exact spellings should be confirmed upstream.
        text = '&lt;&lt;sacr&eacute; bleu!&gt;&gt;'
        soup = BeautifulStoneSoup(text)  # must at least parse cleanly
        self.assertSoupEquals(text)

        xmlEnt = BeautifulStoneSoup.XML_ENTITIES
        htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
        xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES

        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEqual(soup.decode(), '&lt;&lt;sacr&eacute; bleu!&gt;&gt;')

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEqual(soup.decode(), u'&lt;&lt;sacr\xe9 bleu!&gt;&gt;')

        # Make sure the XML, HTML and XHTML settings each behave.
        text = '&lt;&trade;&apos;'
        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEqual(soup.decode(), u"&lt;&trade;'")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEqual(soup.decode(), u'&lt;\u2122&apos;')

        soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
        self.assertEqual(soup.decode(), u"&lt;\u2122'")

    def testNonBreakingSpaces(self):
        # NOTE(review): input restored as two &nbsp; references to match
        # the two U+00A0 characters in the expected value; the dump showed
        # a single plain space.  Confirm.
        soup = BeautifulSoup('<a>&nbsp;&nbsp;</a>',
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEqual(soup.decode(), u'<a>\xa0\xa0</a>')

    def testWhitespaceInDeclaration(self):
        self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')

    def testJunkInDeclaration(self):
        self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')

    def testIncompleteDeclaration(self):
        self.assertSoupEquals('a<!b <p>c')

    def testEntityReplacement(self):
        # NOTE(review): "&nbsp;" restored; the dump showed a plain space,
        # which would leave this test without any entity to exercise.
        self.assertSoupEquals('<b>hello&nbsp;there</b>')

    def testEntitiesInAttributeValues(self):
        # NOTE(review): the dump showed the same decoded n-tilde in both
        # inputs; they are presumably the decimal and hexadecimal character
        # references for U+00F1.  Confirm.
        self.assertSoupEquals('<x t="x&#241;">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')
        self.assertSoupEquals('<x t="x&#xf1;">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')

        # NOTE(review): entity references below restored; confirm.
        soup = BeautifulSoup('<x t="&gt;&trade;">',
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEqual(soup.decode(), u'<x t="&gt;\u2122"></x>')

        uri = 'http://crummy.com?sacr&eacute;&amp;bleu'
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEqual(soup.decode(), link.replace('&eacute;', u'\xe9'))

        uri = 'http://crummy.com?sacr&eacute;&bleu'
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEqual(soup.a['href'], uri.replace('&eacute;', u'\xe9'))

    def testNakedAmpersands(self):
        # NOTE(review): "&amp;" restored in the expectations; the dump had
        # the no-op ``.replace('&', '&')``.  Confirm.
        html = {'convertEntities': BeautifulStoneSoup.HTML_ENTITIES}
        soup = BeautifulStoneSoup('AT&T ', **html)
        self.assertEqual(soup.decode(), 'AT&amp;T ')

        nakedAmpersandInASentence = 'AT&T was Ma Bell'
        soup = BeautifulStoneSoup(nakedAmpersandInASentence, **html)
        self.assertEqual(soup.decode(),
                         nakedAmpersandInASentence.replace('&', '&amp;'))

        invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
        validURL = invalidURL.replace('&', '&amp;')
        soup = BeautifulStoneSoup(invalidURL)
        self.assertEqual(soup.decode(), validURL)

        soup = BeautifulStoneSoup(validURL)
        self.assertEqual(soup.decode(), validURL)


class EncodeRed(SoupTest):
    '''Tests encoding conversion, Unicode conversion, and Microsoft
    smart quote fixes.'''

    def testUnicodeDammitStandalone(self):
        markup = '<foo>\x92</foo>'
        dammit = UnicodeDammit(markup)
        # NOTE(review): the dump showed a literal U+2019 here; presumably
        # the numeric character reference was decoded by the page.  Confirm.
        self.assertEqual(dammit.unicode, '<foo>&#x2019;</foo>')

        hebrew = '\xed\xe5\xec\xf9'
        dammit = UnicodeDammit(hebrew, ['iso-8859-8'])
        # Same four letters as the UTF-8 bytes asserted in testHebrew.
        self.assertEqual(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
        self.assertEqual(dammit.originalEncoding, 'iso-8859-8')

    def testGarbageInGarbageOut(self):
        ascii = '<foo>a</foo>'
        asciiSoup = BeautifulStoneSoup(ascii)
        self.assertEqual(ascii, asciiSoup.decode())

        unicodeData = u'<foo>\xfc</foo>'
        utf8 = unicodeData.encode('utf-8')
        self.assertEqual(utf8, '<foo>\xc3\xbc</foo>')

        unicodeSoup = BeautifulStoneSoup(unicodeData)
        self.assertEqual(unicodeData, unicodeSoup.decode())
        self.assertEqual(unicodeSoup.foo.string, u'\xfc')

        utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
        self.assertEqual(utf8, utf8Soup.encode('utf-8'))
        self.assertEqual(utf8Soup.originalEncoding, 'utf-8')

        utf8Soup = BeautifulStoneSoup(unicodeData)
        self.assertEqual(utf8, utf8Soup.encode('utf-8'))
        self.assertEqual(utf8Soup.originalEncoding, None)

    def testHandleInvalidCodec(self):
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            soup = BeautifulSoup(u'R\xe4ksm\xf6rg\xe5s'.encode('utf-8'),
                                 fromEncoding=bad_encoding)
            self.assertEqual(soup.originalEncoding, 'utf-8')

    def testUnicodeSearch(self):
        html = u'<html><body><h1>R\xe4ksm\xf6rg\xe5s</h1></body></html>'
        soup = BeautifulSoup(html)
        self.assertEqual(soup.find(text=u'R\xe4ksm\xf6rg\xe5s'),
                         u'R\xe4ksm\xf6rg\xe5s')

    def testRewrittenXMLHeader(self):
        # The XML declaration below is malformed in the visible source too
        # (no closing quote after 1.0); it is reproduced unchanged.
        euc_jp = (
            '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n'
            '\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7'
            '\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc'
            '\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9'
            '\xa1\xa3\n</foo>\n')
        utf8 = (
            "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n"
            '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7'
            '\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3'
            '\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5'
            '\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1'
            '\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82'
            '\n</foo>\n')
        soup = BeautifulStoneSoup(euc_jp)
        if soup.originalEncoding != 'euc-jp':
            raise Exception("Test failed when parsing euc-jp document. "
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it.")

        self.assertEqual(soup.originalEncoding, 'euc-jp')
        self.assertEqual(soup.renderContents('utf-8'), utf8)

        old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
        # NOTE(review): "&rsquo;" restored; the dump showed a literal
        # U+2019 inside a byte string.  Confirm.
        new_text = "<?xml version='1.0' encoding='utf-8'?><foo>&rsquo;</foo>"
        self.assertSoupEquals(old_text, new_text)

    def testRewrittenMetaTag(self):
        no_shift_jis_html = (
            '<html><head>\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B'
            '\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea'
            '\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
            '</pre></body></html>')
        soup = BeautifulSoup(no_shift_jis_html)  # must at least parse cleanly

        # A strainer that skips the header must still yield the <pre> body.
        strainer = SoupStrainer('pre')
        soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
        self.assertEqual(soup.contents[0].name, 'pre')

        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type" />')
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B'
            '\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea'
            '\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
            '</pre></body></html>') % meta_tag
        soup = BeautifulSoup(shift_jis_html)
        if soup.originalEncoding != 'shift-jis':
            raise Exception("Test failed when parsing shift-jis document "
                            "with meta tag '%s'."
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it." % meta_tag)
        self.assertEqual(soup.originalEncoding, 'shift-jis')

        # The stored attribute holds a placeholder where the charset was.
        content_type_tag = soup.meta['content']
        self.assertEqual(
            content_type_tag[content_type_tag.find('charset='):],
            'charset=%SOUP-ENCODING%')
        content_type = str(soup.meta)
        index = content_type.find('charset=')
        self.assertEqual(
            content_type[index:index + len('charset=utf8') + 1],
            'charset=utf-8')
        content_type = soup.meta.encode('shift-jis')
        index = content_type.find('charset=')
        self.assertEqual(
            content_type[index:index + len('charset=shift-jis')],
            'charset=shift-jis'.encode())

        self.assertEqual(
            soup.encode('utf-8'),
            '<html><head>\n'
            '<meta content="text/html; charset=utf-8" '
            'http-equiv="Content-type" />\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7'
            '\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3'
            '\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5'
            '\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1'
            '\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
            '</pre></body></html>')
        self.assertEqual(
            soup.encode('shift-jis'),
            shift_jis_html.replace('x-sjis'.encode(), 'shift-jis'.encode()))

        isolatin = ('<html><meta http-equiv="Content-type" '
                    'content="text/html; charset=ISO-Latin-1" />'
                    'Sacr\xe9 bleu!</html>')
        soup = BeautifulSoup(isolatin)
        utf8 = isolatin.replace('ISO-Latin-1'.encode(), 'utf-8'.encode())
        utf8 = utf8.replace('\xe9', '\xc3\xa9')
        self.assertSoupEquals(soup.encode('utf-8'), utf8, encoding='utf-8')

    def testHebrew(self):
        iso_8859_8 = (
            '<HEAD>\n'
            '<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>'
            '\n\n\n\n</HEAD>\n<BODY>\n'
            '<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n'
            '\xed\xe5\xec\xf9\n</BODY>\n')
        utf8 = (
            '<head>\n'
            '<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n'
            '</head>\n<body>\n'
            '<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n'
            '\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n')
        soup = BeautifulStoneSoup(iso_8859_8, fromEncoding='iso-8859-8')
        self.assertEqual(soup.encode('utf-8'), utf8)

    def testSmartQuotesNotSoSmartAnymore(self):
        # NOTE(review): "&lsquo;"/"&rsquo;" restored; the dump showed
        # literal curly quotes inside a byte string.  Confirm.
        self.assertSoupEquals('\x91Foo\x92 <!--blah-->',
                              '&lsquo;Foo&rsquo; <!--blah-->')

    def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
        # NOTE(review): "&eacute;", "&lsaquo;" and "&rsaquo;" restored;
        # the dump showed the decoded characters.  Confirm.
        smartQuotes = 'Il a dit, \x8bSacr&eacute; bleu!\x9b'
        soup = BeautifulSoup(smartQuotes)
        self.assertEqual(soup.decode(),
                         'Il a dit, &lsaquo;Sacr&eacute; bleu!&rsaquo;')
        soup = BeautifulSoup(smartQuotes, convertEntities='html')
        self.assertEqual(
            soup.encode('utf-8'),
            'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')

    def testDontSeeSmartQuotesWhereThereAreNone(self):
        utf_8 = '\xe3\x82\xb1\xe3\x83\xbc\xe3\x82\xbf\xe3\x82\xa4 Watch'
        self.assertSoupEquals(utf_8, encoding='utf-8')


class Whitewash(SoupTest):
    '''Test whitespace preservation.'''

    def testPreservedWhitespace(self):
        # NOTE(review): runs of spaces restored; the dump collapsed each
        # run to one space.  Exact counts should be confirmed.
        self.assertSoupEquals('<pre>   </pre>')
        self.assertSoupEquals('<pre> woo  </pre>')

    def testCollapsedWhitespace(self):
        # NOTE(review): multi-space input restored; the dump showed input
        # and expected output as identical strings.  Confirm.
        self.assertSoupEquals('<p>   </p>', '<p> </p>')


if __name__ == '__main__':
    unittest.main()